### SCRIPT FOR:
###
### NEGLIGIBLE EFFECT OF POSTAL VOTING ON ELECTORAL TURNOUT:
### RESULTS FROM AN ULTIMATE TEST ON FINNS RESIDING ABROAD
###
### VERSION: 2024-FEB-09


rm(list=ls())
gc()

setwd("O:/")




# LIBRARIES ---------------------------------------------------------------
library("crayon", lib.loc = "N:/")
library("withr", lib.loc = "N:/")
library("ggplot2", lib.loc = "N:/")
library("backports", lib.loc = "N:/")
library("broom", lib.loc = "N:/")
library("sjPlot", lib.loc = "N:/")
library("haven", lib.loc = "N:/")
library("tzdb", lib.loc = "N:/")
library("readr", lib.loc = "N:/")
library("tidyr", lib.loc = "N:/")
library("RColorBrewer", lib.loc = "N:/")
library("labeling", lib.loc = "N:/")
library("farver", lib.loc = "N:/")
library("digest", lib.loc = "N:/")
library("ggalluvial", lib.loc = "N:/")
library("ggpubr", lib.loc = "N:/")
library("grid", lib.loc = "N:/")
library("cowplot", lib.loc = "N:/")
library("RColorBrewer", lib.loc = "N:/")
library("gridExtra", lib.loc = "N:/")
library("cli", lib.loc = "N:/")
library("utf8", lib.loc = "N:/")
library("stargazer", lib.loc = "N:/")
library("plm", lib.loc = "N:/")
library("zoo", lib.loc = "N:/")
library("lmtest", lib.loc = "N:/")
library("fixest", lib.loc = "N:/")
library("marginaleffects", lib.loc = "N:/")
library("stringr", lib.loc = "N:/")
library("dplyr", lib.loc = "N:/")
library("checkmate", lib.loc = "N:/")




# LOADING DATA ------------------------------------------------------------
data <- read_stata("D:/e19/custom-made/u1395_ulkos_vaali.dta")




# __ Data preparations ----------------------------------------------------
# Erasing individuals who appear more than once in 'data'.
data <- 
  data %>%
  group_by(shnro) %>%
  summarise(n = n()) %>%
  filter(n == 1) %>%
  left_join(data, by = "shnro") %>%
  select(-n)





# FIGURE 2: Alluvial - previous voting ------------------------------------

# _ Data preparation ------------------------------------------------------

# Subsetting relevant variables
data.alluvial <- data[c(1:9)]

# Transforming data frame: from wide to long
data.alluvial <- gather(data.alluvial, election, type_of_voting, ek_11:ep_19, factor_key = TRUE)

# Setting factor levels
data.alluvial$shnro <- as.factor(data.alluvial$shnro)
data.alluvial$election <- factor(data.alluvial$election, levels = c("ek_11", "pvi_12", "pvii_12", "ep_14", "ek_15", "pvi_18", "ek_19", "ep_19"))

# Defining 'voting', 'non-voting', 'postal voting', and 'non-eligibility'
data.alluvial$turnout_group <- NA
data.alluvial$turnout_group[data.alluvial$type_of_voting == "1"] <- "Did not vote"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "2"] <- "Did not vote"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "3"] <- "Voted"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "4"] <- "Voted"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "5"] <- "Voted"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "6"] <- "Postal voter"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "9"] <- "Not eligible to vote"
data.alluvial$turnout_group <- factor(data.alluvial$turnout_group, levels = c("Did not vote", "Voted", "Postal voter", "Not eligible to vote"))
#table(data.alluvial$turnout_group, useNA = "ifany")


# Subsetting postal voters (and their previous voting)
# NOTE: The resulting 'data.alluvial' object will include only postal voters
data.alluvial <- merge(data.alluvial, 
                       subset(data.alluvial, turnout_group == "Postal voter" & election == "ek_19")[c(1)], 
                       by = "shnro")

# Adding summary statistics (to be used as labels)
totals <- subset(data.frame(table(data.alluvial$turnout_group, data.alluvial$election)), Var2 == "pvi_18" | Var2 == "ek_15" | Var2 == "ek_19")
totals$Share <- round((totals$Freq/nrow(subset(data.alluvial, election == "ek_19" & turnout_group == "Postal voter")))*100, 1)
totals$label.perc <- paste0(format(totals$Share, nsmall = 1), "%")
totals$label.n <- paste0("(N = ", format(totals$Freq, 0, nsmall = 0, big.mark = ","), ")")
data.alluvial$turnout_group <- factor(data.alluvial$turnout_group, levels = c("Did not vote", "Voted", "Postal voter", "Not eligible to vote", "     "))
data.alluvial$turnout_group[data.alluvial$turnout_group == "Not eligible to vote" & data.alluvial$election == "pvi_18"] <- "     "




# _ Figure ----------------------------------------------------------------
ggpubr::ggarrange(
  grobTree(rectGrob(gp = gpar(fill = "grey90")), 
           textGrob("Voting in the 2015 Finnish Parliamentary election", hjust = 0.5, gp = gpar(col = "black", cex = 1, lineheight = 0.8))),
  
  ggplot(subset(data.alluvial, election == "ek_15" | election == "ek_19"), 
         aes(x = election, stratum = turnout_group, fill = turnout_group, alluvium = shnro)) +
    scale_fill_manual(values = brewer.pal(n = 9, name = "Set1")[c(1, 2, 9, 6)]) +
    geom_stratum(alpha = 0.65, width = 1, decreasing = FALSE) +
    scale_x_discrete(limits = c("ek_15", "ek_19"), 
                     labels = c("Parliamentary\nelections\n2015", "Parliamentary\nelections\n2019")) +
    scale_y_continuous(breaks = c(nrow(subset(data.alluvial, election == "ek_15" | election == "ek_19"))/2*seq(0, 1, 0.05)), 
                       labels = c("0%", "", "", "", "20%", "", "", "", "40%", "", "", "", "60%", "", "", "", "80%", "", "", "", "100%")) +
    labs(title = NULL, x = NULL, y = NULL) +
    coord_cartesian(xlim = c(0.5, 2.5), ylim = c(0, 4613), clip = "off", expand = FALSE) +
    
    # Labels
    # Postal voters
    annotate("text", 
             x = 2, 
             y = totals[11, "Freq"]/2, 
             label = "Postal voters", 
             size = 10/.pt, 
             vjust = -0.15, 
             hjust =  0.5) +
    
    annotate("text", 
             x = 2, 
             y = totals[11, "Freq"]/2, 
             label = paste(totals[11, "label.perc"],
                           totals[11, "label.n"],
                           sep = " "), 
             size = 10/.pt, 
             vjust = 1.15, 
             hjust = 0.5) +
    
    # Did not vote
    annotate("text", 
             x = 1, 
             y = totals[4, "Freq"] + totals[2, "Freq"] + (totals[1, "Freq"]/2), 
             label = "Did not vote", 
             size = 10/.pt, 
             vjust = -0.15, 
             hjust =  0.5) +
    
    annotate("text", 
             x = 1, 
             y = totals[4, "Freq"] + totals[2, "Freq"] + (totals[1, "Freq"]/2), 
             label = paste(totals[1, "label.perc"],
                           totals[1, "label.n"],
                           sep = " "), 
             size = 10/.pt, 
             vjust = 1.15, 
             hjust = 0.5) +
    
    # Voted
    annotate("text", 
             x = 1, 
             y = totals[4, "Freq"] + (totals[2, "Freq"]/2), 
             label = "Voted", 
             size = 10/.pt, 
             vjust = -0.15, 
             hjust =  0.5) +
    
    annotate("text", 
             x = 1, 
             y = totals[4, "Freq"] + (totals[2, "Freq"]/2), 
             label = paste(totals[2, "label.perc"],
                           totals[2, "label.n"],
                           sep = " "), 
             size = 10/.pt, 
             vjust = 1.15, 
             hjust = 0.5) +
    
    # Not eligible to vote
    annotate("text", 
             x = 1, 
             y = totals[4, "Freq"]/2, 
             label = "Not eligible to vote", 
             size = 7.5/.pt, 
             vjust = -0.15, 
             hjust =  0.5) +
    
    annotate("text", 
             x = 1, 
             y = totals[4, "Freq"]/2, 
             label = paste(totals[4, "label.perc"],
                           totals[4, "label.n"],
                           sep = " "), 
             size = 7.5/.pt, 
             vjust = 1.15, 
             hjust = 0.5) +
    
    # Specs
    theme(panel.grid.major.x = element_blank(), 
          panel.grid.minor.x = element_blank(),
          panel.grid.major.y = element_line(colour = "gray90", size = 0.25), 
          panel.grid.minor.y = element_blank(),
          panel.border = element_rect(fill = "transparent", color = "transparent"), 
          panel.background = element_rect(fill = "white", color = "transparent"),
          plot.background = element_rect(fill = "transparent", color = "black", size = 1.35/.pt),
          text = element_text(color = "black"),
          axis.ticks = element_blank(),
          legend.position = "none", 
          plot.margin = unit(c(0.5, 0.5, 0.5, 0.5),"cm")),
  
  nrow = 2, heights = c(1.25, 10)) +
  theme(panel.background = element_rect(fill = "white"), 
        plot.margin = unit(c(0.5, 0.5, 0.5, 0.5),"cm"))


# Saving the output
ggsave(filename = "O:/Fig 2 - postal voters - previous voting methods.png", width = 12.5, height = 17.5, units = "cm", dpi = 600)


# Interim cleanup
rm(data.alluvial, totals)





# FIGURE A1: Alluvial voting over time ------------------------------------
# NOTE: Supplementary figure to Figure 1, showcasing the voting of individuals 
#       across the 2011-2015-2019 elections

# _ Data preparation ------------------------------------------------------

# Subsetting relevant variables
data.alluvial <- data[c(1:4)]

# Transforming data frame: from wide to long
data.alluvial <- gather(data.alluvial, election, type_of_voting, ek_11:ek_19, factor_key = TRUE)

# Storing number of observations separately
n.alluvial.fig <- nrow(data.alluvial)

# Defining 'voting', 'non-voting', 'postal voting', and 'non-eligibility'
data.alluvial$turnout_group <- NA
data.alluvial$turnout_group[data.alluvial$type_of_voting == "1"] <- "Did not vote"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "2"] <- "Did not vote"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "3"] <- "Voted"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "4"] <- "Voted"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "5"] <- "Voted"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "6"] <- "Postal voter"
data.alluvial$turnout_group[data.alluvial$type_of_voting == "9"] <- "Not eligible to vote"
data.alluvial$turnout_group <- factor(data.alluvial$turnout_group, levels = c("Postal voter", "Voted", "Did not vote", "Not eligible to vote"))
#table(data.alluvial$turnout_group, data.alluvial$election, useNA = "ifany")


# Adding summary statistics (to be used as labels)
data.alluvial <-
  data.alluvial %>%
  add_count(election, turnout_group) %>%
  rename(type_total = n) %>%
  add_count(election) %>%
  rename(election_total = n) %>%
  mutate(share = type_total/election_total) %>%
  mutate(label = paste0(round(share*100, 1), "% (N = ", format(type_total, trim = TRUE, big.mark = ","), ")"))




# _ Figure ----------------------------------------------------------------
# NOTE: The process of generating figure is split into two steps: 1. main
#       alluvial diagram stored as ggplot object, and 2. visual adjustments.
#       It is to lower the computational demands. Software tends to crash
#       during the first step. Hence, it is generated separately. If
#       successful, then it is possible to proceed to aesthetics adjustments.

fig <- ggplot(data.alluvial, aes(x = election, 
                                stratum = turnout_group, 
                                fill = turnout_group, 
                                alluvium = shnro,
                                label = turnout_group)) +
  geom_flow(width = 0.5) +
  geom_stratum(alpha = 0.65, width = 0.5)


fig +
  scale_fill_manual(values = brewer.pal(n = 9, name = "Set1")[c(9, 2, 1, 6)]) +
  scale_x_discrete(limits = c("ek_11", 
                              "ek_15", 
                              "ek_19"), 
                   labels = c("Parliamentary\nelections\n2011",
                              "Parliamentary\nelections\n2015", 
                              "Parliamentary\nelections\n2019")) +
  scale_y_continuous(breaks = c((n.alluvial.fig/3)*seq(0, 1, 0.1)), 
                     labels = c("0%", "", "20%", "", "40%", "", "60%", "", "80%", "", "100%")) +
  geom_text(data = subset(data.alluvial, turnout_group != "Postal voter"),
            aes(label = paste0(turnout_group, "\n", label)), stat = "stratum", size = 8/.pt, color = "black", alpha = 0.85, lineheight = 0.85) +
  geom_text(data = subset(data.alluvial, turnout_group == "Postal voter"),
            aes(x = 3,
                y = (n.alluvial.fig/3)-(4613/2),
                label = paste0(turnout_group, " ", label)), 
            size = 5.5/.pt, color = "black", alpha = 0.85, lineheight = 0.85) +
  coord_cartesian(xlim = c(0.75, 3.25), ylim = c(0, n.alluvial.fig/3), expand = FALSE, clip = "off") +
  labs(title = NULL, x = NULL, y = NULL) +
  theme(panel.grid.major.x = element_blank(), 
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_line(colour = "gray90"), 
        panel.grid.minor.y = element_line(colour = "gray90"),
        axis.ticks = element_blank(),
        panel.border = element_blank(), 
        strip.background = element_blank(), panel.background = element_blank(),
        legend.position = "none", axis.text.y.right = element_blank(),
        plot.margin = unit(c(0.5, 0.5, 0.5, 0.5),"cm"))



# Saving the output
ggsave(filename = "O:/Fig A1 - Alluvial - voting over time.png", width = 20, height = 15, units = "cm", dpi = 600)


# Interim cleanup
rm(data.alluvial, n.alluvial.fig, fig)






# INTERRUPTED TIME SERIES -------------------------------------------------

# _ Data preparations -----------------------------------------------------

# Transforming electoral turnout data: from wide to long
ele.data <-
  data %>%
  select(shnro:ek_19) %>%
  gather(election, type_of_voting, ek_11:ek_19, factor_key = TRUE) %>%
  mutate(year = ifelse(str_sub(election, -2) == "11", 2011,
                ifelse(str_sub(election, -2) == "15", 2015, 2019)))

# Transforming residency data: from wide to long
res.data <-  
  data %>%
  mutate(resident_all_FIN = ifelse(kirjoilla2011 == 1 &
                                     kirjoilla2015 == 1 &
                                     kirjoilla2019 == 1, 1,
                                   ifelse(kirjoilla2011 == 0 &
                                            kirjoilla2015 == 0 &
                                            kirjoilla2019 == 0, 0, NA))) %>%
  select(shnro, resident_all_FIN, kirjoilla2011, kirjoilla2015, kirjoilla2019) %>%
  gather(residency, resident_year_FIN, kirjoilla2011:kirjoilla2019, factor_key = TRUE) %>%
  mutate(year = ifelse(str_sub(residency, -2) == "11", 2011,
                       ifelse(str_sub(residency, -2) == "15", 2015, 2019)))

# Marging turnout and residency data
data_ITS <-
  full_join(ele.data, res.data, by = c("shnro", "year")) %>%
  select(shnro, year, type_of_voting, resident_all_FIN, resident_year_FIN)

# Data check
# table(data_ITS$resident_all_FIN, data_ITS$resident_year_FIN, useNA = "ifany")

# Interim cleanup
rm(ele.data, res.data)

# Adding timing variable
data_ITS <-
  data_ITS %>%
  mutate(time = ifelse(year ==  2011, 1,
                ifelse(year ==  2015, 2, 3)))

# Recoding turnout into binary variable (for logistic regression)
data_ITS <-
  data_ITS %>%
  mutate(turnout = ifelse(type_of_voting <= 2, 0,
                   ifelse(type_of_voting >= 3  & 
                          type_of_voting <= 6, 1, NA)))

# Defining 'residency' (stable during all three elections) as factor 
data_ITS$resident_all_FIN <- as.factor(data_ITS$resident_all_FIN)

# Defining 'year' as factor 
data_ITS$year <- factor(data_ITS$year, levels = c("2011", "2015", "2019"))

# Defining 'ID' (i.e., 'shnro') as factor for ID fixed effects
data_ITS$shnro <- as.factor(data_ITS$shnro)





# _ TABLE 1 ---------------------------------------------------------------

# Model 1: LPM
# NOTE: Linear probability model
m1.lpm.ITS.all <- feols(turnout ~  year * resident_all_FIN,
                        data = filter(data_ITS, !is.na(resident_all_FIN)),
                        cluster = "shnro")

# Summary
etable(m1.lpm.ITS.all)

# Number of unique individuals
data_ITS %>%
  filter(!is.na(resident_all_FIN)) %>%
  select(shnro) %>%
  unique() %>%
  nrow()




# _ TABLE B1 --------------------------------------------------------------
# NOTE: Robustness test of the Linear Probability Model 1 (from Table 1)
#       with generalized linear model (binomial GLM)

# Model B1: GLM
m1.glm.ITS.all <- feglm(turnout ~  year * resident_all_FIN,
                        family = "binomial",
                        data = filter(data_ITS, !is.na(resident_all_FIN)),
                        cluster = "shnro")

# Summary
etable(m1.glm.ITS.all)

# Number of unique individuals
data_ITS %>%
  filter(!is.na(resident_all_FIN)) %>%
  select(shnro) %>%
  unique() %>%
  nrow()

# Generating predicted probabilities
marginaleffects::marginaleffects(m1.glm.ITS.all, type = "response", variables = "year", se.fit = T) %>%
  select(type, year, resident_all_FIN, predicted) %>%
  unique()



# _ FIGURE 3 --------------------------------------------------------------
# NOTE: Visualization of the LPM Model 1 (Table 1)

# Regenerating Model 1 with more intuitive coefficients
m1.lpm.ITS.all <- feols(turnout ~  year : resident_all_FIN,
                        data = filter(data_ITS, !is.na(resident_all_FIN)),
                        cluster = "shnro")

# Check
# etable(m1.lpm.ITS.all)

# Generating predicted probabilities for Figure 3
fig3.pred <-
  rbind(
    data.frame(group     = "Voters residing in Finland",
               year      = 2019,
               estimate  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "estimate"], 
               conf.low  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.low"],
               conf.high = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.high"]),
    
    data.frame(group     = "Voters residing abroad",
               year      = 2011,
               estimate  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[2, "estimate"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "estimate"], 
               conf.low  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[2, "conf.low"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.low"],
               conf.high = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[2, "conf.high"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.high"]),
    
    data.frame(group     = "Voters residing abroad",
               year      = 2015,
               estimate  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[3, "estimate"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "estimate"], 
               conf.low  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[3, "conf.low"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.low"],
               conf.high = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[3, "conf.high"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.high"]),
    
    data.frame(group     = "Voters residing abroad",
               year      = 2019,
               estimate  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[4, "estimate"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "estimate"], 
               conf.low  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[4, "conf.low"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.low"],
               conf.high = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[4, "conf.high"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.high"]),
    
    data.frame(group     = "Voters residing in Finland",
               year      = 2011,
               estimate  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[5, "estimate"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "estimate"], 
               conf.low  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[5, "conf.low"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.low"],
               conf.high = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[5, "conf.high"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.high"]),
    
    data.frame(group     = "Voters residing in Finland",
               year      = 2015,
               estimate  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[6, "estimate"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "estimate"], 
               conf.low  = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[6, "conf.low"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.low"],
               conf.high = marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[6, "conf.high"] + marginaleffects::tidy(m1.lpm.ITS.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.high"])
  )


# Adjusting factor levels (for order of the legend items)
fig3.pred$group <- factor(fig3.pred$group, levels = c("Voters residing in Finland", "Voters residing abroad"))


# Figure
ggplot(fig3.pred, aes(x = year, y = estimate, group = group, color = group)) +
  geom_hline(yintercept = seq(0.05, 0.95, 0.05), colour = "gray90", size = 0.2) +
  geom_hline(yintercept = seq(0.20, 0.80, 0.20), colour = "gray90", size = 0.5) +
  geom_vline(xintercept = 2017, linetype = "longdash", color = brewer.pal(n = 9, name = "Set1")[9]) +
  annotate("text", x = 2016.9, y = 0.945, hjust = 1, vjust = 0.5, label = "Adoption of\npostal voting", size = 8.5/.pt, color = brewer.pal(n = 9, name = "Set1")[9], lineheight = 0.9) +
  geom_point() +
  geom_line() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  scale_color_manual(values = brewer.pal(n = 9, name = "Set1")[c(1, 2)]) +
  scale_x_continuous(breaks = c(2011, 2015, 2019), 
                   labels = c("Parliamentary\nelections\n2011",
                              "Parliamentary\nelections\n2015", 
                              "Parliamentary\nelections\n2019")) +
  scale_y_continuous(breaks = seq(0, 1, 0.1), 
                     labels = c("0%", "", "20%", "", "40%", "", "60%", "", "80%", "", "100%")) +
  coord_cartesian(xlim = c(2010, 2020), ylim = c(0, 1), expand = FALSE, clip = "off") +
  labs(title = NULL, x = NULL, y = "Predicted probability of voting", color = NULL) +
  theme(panel.grid = element_blank(),
        panel.border = element_rect(fill = "transparent", color = "black"), 
        panel.background = element_rect(fill = "white", color = "transparent"),
        plot.background = element_rect(fill = "transparent", color = "transparent", size = 1.35/.pt),
        text = element_text(color = "black"),
        legend.key = element_blank(),
        legend.key.size = unit(0.65, "cm"),
        axis.ticks = element_blank(),
        plot.margin = unit(c(0.5, 0.5, 0.5, 0.5),"cm"))



# Saving the output
ggsave(filename = "O:/Fig 3 - interrupted time series - turnout probabilities.png", width = 20, height = 10, units = "cm", dpi = 600)



# Interim cleanup
rm(fig3.pred, m1.lpm.ITS.all, m1.glm.ITS.all)





# EFFECT HETEROGENEITY ----------------------------------------------------
# NOTE: This part of the code uses the data prepared under 'INTERRUPTED
#       TIME SERIES: Data preparations'. Make sure to run that part of
#       the code before proceeding the following estimation.


# Lagging the 'turnout' variable and filtering non-missing rows
# NOTE: feglm() may have 'panel.id' functionality to lag the variable.
#       But when used, marginaleffects::marginaleffects() returns an error.
data_ITS_EH <- 
  data_ITS %>%
  group_by(shnro) %>%
  mutate(lag_turnout = lag(turnout, order_by = time)) %>%
  filter(resident_all_FIN == 0 & !is.na(lag_turnout))

# Adjusting factor levels (to obtain year = 2015 as a reference, and
# year = 2019 as a separate group)
data_ITS_EH$year <- factor(data_ITS_EH$year, levels = c("2011", "2019", "2015"))

# Adjusting factor levels
data_ITS_EH$lag_turnout <- as.factor(data_ITS_EH$lag_turnout)



# _ TABLE C1 --------------------------------------------------------------
# NOTE: Estimating the turnout probability AMONG NON-RESIDENT citizens
#       conditional on their previous electoral participation.



# Model C1: Linear probability model
m1.lpm.EH.all <- feols(turnout ~  year * lag_turnout,
                       data = data_ITS_EH,
                       cluster = "shnro")

# Summary
etable(m1.lpm.EH.all)

# Number of unique individuals
data_ITS_EH %>%
  select(shnro) %>%
  unique() %>%
  nrow()




# _ TABLE C2 --------------------------------------------------------------
# NOTE: Robustness test replicating the results from the linear probability
#       model C1 in Table C1


# Model C2: Generalized linear regression (binomial logistic regression)
m1.glm.EH.all <- feglm(turnout ~  year * lag_turnout, 
                       data = data_ITS_EH,
                       cluster = "shnro")

# Summary
etable(m1.glm.EH.all)


# Number of unique individuals
data_ITS_EH %>%
  select(shnro) %>%
  unique() %>%
  nrow()


# Generating predicted probabilities
marginaleffects::marginaleffects(m1.glm.EH.all, type = "response", variables = "year", se.fit = T) %>%
  select(type, year, lag_turnout, predicted) %>%
  unique()



# _ FIGURE 4 --------------------------------------------------------------
# NOTE: Visualization of the LPM Model C1 (Table C1)

# Regenerating Model C1 with more intuitive coefficients
m1.lpm.EH.all <- feols(turnout ~  year : lag_turnout,
                       data = data_ITS_EH,
                       cluster = "shnro")

# Check
# etable(m1.lpm.EH.all)

# Generating predicted probabilities for Figure 3
fig4.pred <-
  rbind(
    data.frame(group     = "Previously voted",
               year      = 2015,
               estimate  = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "estimate"], 
               conf.low  = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.low"],
               conf.high = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.high"]),
    
    data.frame(group     = "Previously abstained",
               year      = 2019,
               estimate  = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[2, "estimate"] + marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "estimate"], 
               conf.low  = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[2, "conf.low"] + marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.low"],
               conf.high = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[2, "conf.high"] + marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.high"]),
    
    data.frame(group     = "Previously abstained",
               year      = 2015,
               estimate  = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[3, "estimate"] + marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "estimate"], 
               conf.low  = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[3, "conf.low"] + marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.low"],
               conf.high = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[3, "conf.high"] + marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.high"]),
    
    data.frame(group     = "Previously voted",
               year      = 2019,
               estimate  = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[4, "estimate"] + marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "estimate"], 
               conf.low  = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[4, "conf.low"] + marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.low"],
               conf.high = marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[4, "conf.high"] + marginaleffects::tidy(m1.lpm.EH.all, conf.int = TRUE, conf.level = 0.95)[1, "conf.high"])
    )


# Adjusting factor levels (for order of the legend items)
#fig4.pred$group <- factor(fig4.pred$group, levels = c("Voters residing in Finland", "Voters residing abroad"))


# Figure
ggplot(fig4.pred, aes(x = year, y = estimate, group = group, color = group)) +
  geom_hline(yintercept = seq(0.05, 0.95, 0.05), colour = "gray90", size = 0.2) +
  geom_hline(yintercept = seq(0.20, 0.80, 0.20), colour = "gray90", size = 0.5) +
  geom_vline(xintercept = 2017, linetype = "longdash", color = brewer.pal(n = 9, name = "Set1")[9]) +
  annotate("text", x = 2016.9, y = 0.945, hjust = 1, vjust = 0.5, label = "Adoption of\npostal voting", size = 8.5/.pt, color = brewer.pal(n = 9, name = "Set1")[9], lineheight = 0.9) +
  geom_point() +
  geom_line() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  scale_color_manual(values = brewer.pal(n = 9, name = "Set1")[c(1, 2)]) +
  scale_x_continuous(breaks = c(2011, 2015, 2019), 
                     labels = c("Parliamentary\nelections\n2011",
                                "Parliamentary\nelections\n2015", 
                                "Parliamentary\nelections\n2019")) +
  scale_y_continuous(breaks = seq(0, 1, 0.1), 
                     labels = c("0%", "", "20%", "", "40%", "", "60%", "", "80%", "", "100%")) +
  coord_cartesian(xlim = c(2010, 2020), ylim = c(0, 1), expand = FALSE, clip = "off") +
  labs(title = NULL, x = NULL, y = "Predicted probability of voting", color = NULL) +
  theme(panel.grid = element_blank(),
        panel.border = element_rect(fill = "transparent", color = "black"), 
        panel.background = element_rect(fill = "white", color = "transparent"),
        plot.background = element_rect(fill = "transparent", color = "transparent", size = 1.35/.pt),
        text = element_text(color = "black"),
        legend.key = element_blank(),
        legend.key.size = unit(0.65, "cm"),
        axis.ticks = element_blank(),
        plot.margin = unit(c(0.5, 0.5, 0.5, 0.5),"cm"))



# Saving the output
ggsave(filename = "O:/Fig 4 - effect heterogeneity.png", width = 20, height = 10, units = "cm", dpi = 600)


# Interim cleanup
rm(m1.lpm.EH.all, m1.glm.EH.all, fig4.pred, data_ITS, data_ITS_EH)




# FIRST DIFFERENCE --------------------------------------------------------

# _ Data preparations -----------------------------------------------------

# Election data
ele_FD <-
  data %>%
  select(shnro, ek_11:ek_19) %>%
  na.omit() %>%
  gather(election, type_of_voting, ek_11:ek_19, factor_key = TRUE) %>%
  mutate(turnout = ifelse(type_of_voting <= 2, 0,
                          ifelse(type_of_voting >= 3  & 
                                   type_of_voting <= 6, 1, NA))) %>%
  mutate(year = ifelse(str_sub(election, -2) == "11", 2011,
                       ifelse(str_sub(election, -2) == "15", 2015, 2019))) 


# Residency data
res_FD <-
  data %>%
  select(shnro, kirjoilla2011, kirjoilla2015, kirjoilla2019) %>%
  na.omit() %>%
  gather(admin_year, resident_in_FIN, kirjoilla2011:kirjoilla2019, factor_key = TRUE) %>%
  mutate(year = ifelse(str_sub(admin_year, -2) == "11", 2011,
                       ifelse(str_sub(admin_year, -2) == "15", 2015, 2019)))


# Data merge
data_FD <-
  full_join(select(ele_FD, c(1, 5, 4)),
            select(res_FD, c(1, 4, 3)),
            by = c("shnro", "year")) %>%
  mutate(time = ifelse(year ==  2011, 0,
                       ifelse(year ==  2015, 1, 2)))

# Interim cleanup
rm(ele_FD, res_FD)


# __ Adjusting factor levels ----------------------------------------------
data_FD$resident_in_FIN <- factor(data_FD$resident_in_FIN, levels = c(1, 0))
data_FD$time <- factor(data_FD$time, levels = c(0, 1, 2))
data_FD$year <- factor(data_FD$year, levels = c("2011", "2015", "2019"))


# Calculating number of occurences of each individual
data_FD <-
  data_FD %>%
  na.omit() %>%
  group_by(shnro) %>%
  mutate(n = n()) 



# _ TABLE 2 ---------------------------------------------------------------
# NOTE: Estimated is a linear probability model

# MODEL 2: Effect of moving outside of country
m2.lpm.FD.all <- feols(turnout ~  resident_in_FIN  | year + shnro,
                      data = filter(data_FD, n == 3),
                      cluster = "shnro")

# MODEL 3: Effect of moving outside of each year
m3.lpm.FD.all <- feols(turnout ~  year : resident_in_FIN  | year + shnro,
                       data = filter(data_FD, n == 3),
                       cluster = "shnro")

# Summary
etable(m2.lpm.FD.all, m3.lpm.FD.all)

# Number of unique individuals
data_FD_2011_2015 %>%
  select(shnro, year, turnout) %>%
  pivot_wider(names_from = year, values_from = turnout) %>%
  na.omit() %>%
  select(shnro) %>%
  unique() %>%
  nrow()


# MODEL 3: Residency change between 2015-2019
m.15_19.lpm.FD.all <- feols(turnout ~  time * resident_in_FIN | time + shnro,
                            data = data_FD_2015_2019,
                            cluster = "shnro")

# Number of unique individuals
data_FD_2015_2019 %>%
  select(shnro, year, turnout) %>%
  pivot_wider(names_from = year, values_from = turnout) %>%
  na.omit() %>%
  select(shnro) %>%
  unique() %>%
  nrow()


# Summary
etable(m.11_15.lpm.FD.all,
       m.15_19.lpm.FD.all)



# FIGURE REMAINS THE SAME AS BEFORE, SO THE CODE WILL BE UPDATED LATER




# _ TABLE D1 --------------------------------------------------------------
# NOTE: Estimated are linear probability models

# MODEL D1: Residency change between 2011-2015
m.11_15.glm.FD.all <- feglm(turnout ~  time + resident_in_FIN | time + shnro,
                            family = "binomial",
                            data = data_FD_2011_2015,
                            cluster = "shnro")

# Number of unique individuals
data_FD_2011_2015 %>%
  select(shnro, year, turnout) %>%
  pivot_wider(names_from = year, values_from = turnout) %>%
  na.omit() %>%
  select(shnro) %>%
  unique() %>%
  nrow()


# MODEL D2: Residency change between 2015-2019
m.15_19.glm.FD.all <- feglm(turnout ~  time * resident_in_FIN | time + shnro,
                            family = "binomial",
                            data = data_FD_2015_2019,
                            cluster = "shnro")

# Number of unique individuals
data_FD_2015_2019 %>%
  select(shnro, year, turnout) %>%
  pivot_wider(names_from = year, values_from = turnout) %>%
  na.omit() %>%
  select(shnro) %>%
  unique() %>%
  nrow()


# Summary
etable(m.11_15.glm.FD.all,
       m.15_19.glm.FD.all)


# Generating predicted probabilities
marginaleffects::marginaleffects(m.11_15.glm.FD.all, type = "response", variables = "time", se.fit = T) %>%
  select(type, time, resident_in_FIN, predicted) %>%
  unique()
